*****************************************************************
* Replication directory for                                   ***
* Prime locations                                             ***
* by Gabriel M. Ahlfeldt, Thilo N.H. Albers, Kristian Behrens ***
* Published in American Economic Review: Insights             ***
*****************************************************************
* 01/2025
* Stata
version 17.0

* This do file generates MSA grid shapefiles with identifiers for our GitHub toolkit
* It requires a Python working environment with packages mentioned at the beginning of the Python code
* Should the code not run through in Stata, the part between "python:" and "end" can be executed in Python
* In this case, you must manually set the root directory in line 28. 

* Generate folder for new grids
	capture mkdir "$temp/NEWGRIDS"

	cd "$root"
* Start Python code
python:	

import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon

# Set root directory to Stata's current working directory
root_directory = os.getcwd()

# File path for the MSA-list with CBSAFP identifiers (relative to root_directory)
msa_list_file = os.path.join(root_directory, r'_Data/US METRO DATA/MSA-list.csv')

# Read the list of identifiers (CBSAFP) from the CSV file
msa_df = pd.read_csv(msa_list_file)
identifiers = msa_df['cbsafp'].tolist()  # Extract the list of identifiers

# File path templates (relative to root_directory)
input_folder = os.path.join(root_directory, r'_Data/US METRO DATA/GIS Data/US METRO GRIDS')
output_folder = os.path.join(root_directory, r'_Work/TEMP/NEWGRIDS')

# Define the projection (EPSG:4326 corresponds to WGS 84 in decimal degrees)
crs = "EPSG:4326"

# Loop over each identifier and process files
for identifier in identifiers:
    # Convert identifier to string if it's numeric (to ensure correct file path formatting)
    identifier_str = str(identifier)
    
    # Construct the input file path (corrected file name to GRID_XXXXX_final.csv)
    input_file = os.path.join(input_folder, f'GRID_{identifier_str}_final.csv')
    
    # Check if the input file exists to avoid errors
    if not os.path.exists(input_file):
        print(f"File not found: {input_file}")
        continue  # Skip to the next identifier
    
    # Load the data from the CSV file
    df = pd.read_csv(input_file)
    
    # Rename 'ClusterID' to 'cell_id'
    df.rename(columns={'ClusterID': 'cell_id'}, inplace=True)
    
    # Create a Polygon for each observation using the min/max x and y values
    df['geometry'] = df.apply(
        lambda row: Polygon([
            (row['min_x'], row['max_y']),  # Upper Left
            (row['max_x'], row['max_y']),  # Upper Right
            (row['max_x'], row['min_y']),  # Lower Right
            (row['min_x'], row['min_y']),  # Lower Left
            (row['min_x'], row['max_y'])   # Close the polygon
        ]), axis=1
    )
    
    # Convert to GeoDataFrame and assign the CRS (EPSG:4326 for lat/lon in degrees)
    gdf = gpd.GeoDataFrame(df, geometry='geometry', crs=crs)
    
    # Construct the output shapefile path
    shapefile_path = os.path.join(output_folder, f'GRID_{identifier_str}.shp')
    
    # Save the GeoDataFrame as a shapefile
    gdf.to_file(shapefile_path)
    
    # Confirm the shapefile has been saved
    print(f"Shapefile saved at: {shapefile_path}")
end
* Script ends
